LoL winner predictor¶

Introduction¶

image.png

image.png

image.png

image.png

Prepare the data¶

Open the data¶

In [2]:
# Load the raw scoreboard dump; the keys of the resulting dict are game identifiers.
file = '../raw_data/full_dump.json'
# JSON text is UTF-8: state the encoding explicitly so the read does not depend
# on the platform's default locale encoding.
with open(file, encoding='utf-8') as data_file:
    data = json.load(data_file)

# Peek at a handful of random keys to see what a game identifier looks like.
data_keys = list(data.keys())
random.sample(data_keys, 5)
Out[2]:
['NA Academy League/2018 Season/Summer Playoffs/Scoreboards/Semifinals and Finals_3_4',
 'Circuito de Leyendas Sur/2017 Season/Opening Season/Scoreboards/Week 6_4_2',
 'Challengers Korea/2020 Season/Summer Season/Scoreboards/Week 7_1_2',
 'Ultraliga/Season 5/Scoreboards/Week 6_6_1',
 '2016 International Wildcard Qualifier/Scoreboards/4-6_6_1']
In [4]:
f'input data: {len(data)}'
Out[4]:
'input data: 35320'

Normalize the data¶

In [5]:
# "Normalizing" flattens the nested JSON records: nested keys become dotted
# column names (for example `teams.BLUE.name`).
# Flatten every game record, then work on a copy so the normalized frame stays untouched.
df_normalized_teams = pd.json_normalize(list(data.values()))
df_teams = df_normalized_teams.copy(deep=True)
df_teams.sample(n=2)
Out[5]:
id start patch winner duration picks_bans teams.BLUE.name teams.BLUE.total_turret_kills teams.BLUE.total_inhibitor_kills teams.BLUE.total_rift_herald_kills ... teams.BLUE.side teams.BLUE.players teams.RED.name teams.RED.total_turret_kills teams.RED.total_inhibitor_kills teams.RED.total_rift_herald_kills teams.RED.total_dragon_kills teams.RED.total_baron_kills teams.RED.side teams.RED.players
35189 CLS/2017 Season/Preseason Tournament/Scoreboar... None 6.21 RED 2104 [] Rebirth eSports 3 0 0 ... BLUE [{'name': 'MANTARRAYA', 'id': 180979, 'role': ... Last Kings 11 3 0 4 2 RED [{'name': 'Nipphu', 'id': 185761, 'role': 'TOP...
14850 LCS/2019 Season/Summer Season/Scoreboards/Week... None 9.12 BLUE 1690 [{'champion_name': 'Rumble', 'is_ban': True}, ... Cloud9 11 3 1 ... BLUE [{'name': 'Kumo', 'id': 214202, 'role': 'TOP',... Clutch Gaming 0 0 0 1 0 RED [{'name': 'Huni', 'id': 165378, 'role': 'TOP',...

2 rows × 22 columns

In [6]:
def get_patch_year(s):
    """Return the year implied by a patch string such as ``'9.19'``.

    The major patch number (the text before the first dot) is offset by 2010,
    so ``'9.19'`` gives 2019 and ``'11.2'`` gives 2021.
    """
    major_version, _, _ = s.partition('.')
    return 2010 + int(major_version)

# Derive the year each game was played from its patch version.
df_teams['year'] = df_teams['patch'].apply(get_patch_year)
df_teams[['patch', 'year']].sample(n=2)
Out[6]:
patch year
17489 9.19 2019
27693 11.2 2021
In [7]:
# BLUE side: explode each game's player list into one entry per player,
# then flatten the player dicts into columns.
df_normalized_BLUE = pd.json_normalize(
    df_normalized_teams['teams.BLUE.players'].explode()
)
df_BLUE = df_normalized_BLUE.copy(deep=True)  # working copy; the normalized frame stays as-is
df_BLUE.head()
Out[7]:
name id role champion_name champion_id gold_15 kills_assists_15 deaths_15 total_gold total_cs total_kills total_monster_kills total_assists total_deaths total_damage_taken total_damage_dealt win side
0 Kikis 172122.0 TOP Nautilus 111 5326.0 8.0 1.0 12504 233 2 0 6 1 18220 156270 True BLUE
1 Broxah 193072.0 JGL Lee Sin 64 5261.0 11.0 1.0 12352 153 4 132 7 1 22212 152183 True BLUE
2 Nisqy 185791.0 MID Syndra 134 6009.0 13.0 1.0 13393 229 5 6 8 1 10647 177681 True BLUE
3 MrRallez 183407.0 BOT Jhin 202 5304.0 11.0 0.0 13969 315 2 5 9 0 9758 228328 True BLUE
4 Klaj 171882.0 SUP Karma 43 2767.0 12.0 0.0 9740 38 1 0 11 0 11917 34299 True BLUE
In [8]:
# RED side: explode each game's player list into one entry per player,
# then flatten the player dicts into columns.
df_normalized_RED = pd.json_normalize(
    df_normalized_teams['teams.RED.players'].explode()
)
df_RED = df_normalized_RED.copy(deep=True)  # working copy; the normalized frame stays as-is
df_RED.head()
Out[8]:
name id role champion_name champion_id gold_15 kills_assists_15 deaths_15 total_gold total_cs total_kills total_monster_kills total_assists total_deaths total_damage_taken total_damage_dealt win side
0 Phones 193289.0 TOP Maokai 57 4528.0 2.0 7.0 9611 190 1 13 1 7 41065 111536 False RED
1 Obvious 187241.0 JGL Rengar 107 4728.0 2.0 1.0 9640 174 0 126 2 1 27879 147035 False RED
2 MagiFelix 181359.0 MID Ryze 13 4893.0 2.0 3.0 11840 301 0 2 2 3 16013 205899 False RED
3 Sedrion 197437.0 BOT Varus 110 5133.0 2.0 1.0 12010 283 2 1 0 1 10370 184927 False RED
4 Noxiak 185879.0 SUP Nami 267 2521.0 2.0 2.0 7348 16 0 0 2 2 13815 15418 False RED
In [9]:
# Tag every player row with the id of the game it belongs to.
# This relies on each game contributing 5 consecutive player rows per side,
# so every game id is repeated 5 times, in game order.
get_index = df_normalized_teams['id'].tolist()
index_preproc = np.repeat(np.asarray(get_index)[:, np.newaxis], 5, axis=1)
index_teams = index_preproc.ravel().tolist()
df_BLUE['game_id'] = index_teams
df_RED['game_id'] = index_teams

# Spot-check one known game on the BLUE side.
df_BLUE[df_BLUE['game_id'] == '2016 International Wildcard Invitational/Scoreboards/Bracket Stage_1_1']
Out[9]:
name id role champion_name champion_id gold_15 kills_assists_15 deaths_15 total_gold total_cs total_kills total_monster_kills total_assists total_deaths total_damage_taken total_damage_dealt win side game_id
165535 Smurf 197966.0 TOP Trundle 48 5175.0 8.0 3.0 15494 294 0 15 8 3 36337 241203 True BLUE 2016 International Wildcard Invitational/Score...
165536 PvPStejos 194522.0 JGL Graves 104 4865.0 11.0 2.0 16049 177 6 148 5 2 25664 231200 True BLUE 2016 International Wildcard Invitational/Score...
165537 Kira 172113.0 MID Lissandra 127 5533.0 14.0 2.0 16549 325 3 25 11 2 25513 252607 True BLUE 2016 International Wildcard Invitational/Score...
165538 Onesh0tiq 188541.0 BOT Lucian 236 5722.0 13.0 1.0 18339 356 6 15 7 1 15931 251707 True BLUE 2016 International Wildcard Invitational/Score...
165539 Likkrit 179739.0 SUP Tahm Kench 223 3386.0 10.0 2.0 12309 80 1 6 9 2 23811 56257 True BLUE 2016 International Wildcard Invitational/Score...
In [10]:
df_RED[df_RED['game_id'] == '2016 International Wildcard Invitational/Scoreboards/Bracket Stage_1_1']
Out[10]:
name id role champion_name champion_id gold_15 kills_assists_15 deaths_15 total_gold total_cs total_kills total_monster_kills total_assists total_deaths total_damage_taken total_damage_dealt win side game_id
165535 Yang 205635.0 TOP Maokai 57 4891.0 9.0 2.0 12994 260 1 12 8 2 26978 182014 False RED 2016 International Wildcard Invitational/Score...
165536 Revolta 195157.0 JGL Kindred 203 4730.0 6.0 3.0 12638 178 1 122 5 3 27564 195085 False RED 2016 International Wildcard Invitational/Score...
165537 tockers 201599.0 MID Ekko 245 5159.0 9.0 2.0 13866 285 5 10 4 2 31698 215422 False RED 2016 International Wildcard Invitational/Score...
165538 micaO 182405.0 BOT Jinx 222 5994.0 9.0 5.0 15322 356 3 14 6 5 19697 249792 False RED 2016 International Wildcard Invitational/Score...
165539 Jockster 169596.0 SUP Thresh 412 3155.0 6.0 4.0 8906 36 0 0 6 4 18449 20745 False RED 2016 International Wildcard Invitational/Score...
In [11]:
#Format the data into a df that simulates the one we will receive from the web
def rearrange_df(df):
    """Reshape one side's player rows into one row per game, one column per role.

    ``df`` needs ``champion_id``, ``game_id`` and ``role`` columns. The result
    has the columns ``TOP``, ``game_id``, ``JGL``, ``BOT``, ``MID``, ``SUP``,
    each role column holding the champion id picked for that role. Rows are
    produced by inner merges on ``game_id``, so a game missing any of the five
    roles is dropped.
    """
    def picks_for(role):
        # Champion ids played in `role`, with the id column renamed to the role.
        in_role = df.role == role
        return df.loc[in_role, ['champion_id', 'game_id']].rename(columns={'champion_id': role})

    top_and_jgl = pd.merge(picks_for('TOP'), picks_for('JGL'), on='game_id')
    bot_and_mid = pd.merge(picks_for('BOT'), picks_for('MID'), on='game_id')
    four_roles = pd.merge(top_and_jgl, bot_and_mid, on='game_id')
    return pd.merge(four_roles, picks_for('SUP'), on='game_id')
In [12]:
# Build the per-game draft table, shaped like the input we will receive from the web.
df_blue = rearrange_df(df_BLUE)
df_red = rearrange_df(df_RED)
# Use a dedicated name for the merged picks: `data` still holds the raw JSON dict,
# and rebinding it to a DataFrame would make the normalization cell
# (`pd.json_normalize(data.values())`) fail when it is re-run.
df_picks = pd.merge(df_blue, df_red, on='game_id')
df_teams['game_id'] = df_teams['id']

# Blue-side champions carry the _x suffix, red-side champions the _y suffix.
full_data = pd.merge(df_picks, df_teams[['winner', 'game_id']], on='game_id')
full_data
Out[12]:
TOP_x game_id JGL_x BOT_x MID_x SUP_x TOP_y JGL_y BOT_y MID_y SUP_y winner
0 111 EU Challenger Series/2017 Season/Spring Qualif... 64 202 134 43 57 107 110 13 267 BLUE
1 50 EU Challenger Series/2017 Season/Spring Qualif... 421 81 126 43 98 56 22 134 90 RED
2 111 EU Challenger Series/2017 Season/Spring Qualif... 107 22 61 201 78 121 81 134 43 RED
3 78 EU Challenger Series/2017 Season/Spring Qualif... 164 202 61 90 68 421 110 7 412 BLUE
4 78 EU Challenger Series/2017 Season/Spring Qualif... 2 110 134 267 114 421 15 105 43 BLUE
... ... ... ... ... ... ... ... ... ... ... ... ...
35315 57 IEM Season 11/Gyeonggi/Scoreboards/Playoffs_2_3 421 202 134 432 78 60 81 13 43 RED
35316 78 IEM Season 11/Gyeonggi/Scoreboards/Playoffs_3_1 60 81 69 201 57 421 15 61 43 RED
35317 78 IEM Season 11/Gyeonggi/Scoreboards/Playoffs_3_2 121 22 13 412 111 421 81 61 43 BLUE
35318 78 IEM Season 11/Gyeonggi/Scoreboards/Playoffs_3_3 121 81 69 143 85 421 202 61 21 BLUE
35319 57 IEM Season 11/Gyeonggi/Scoreboards/Playoffs_3_4 64 81 112 201 78 121 202 69 43 BLUE

35320 rows × 12 columns

In [13]:
# Quick overview of the data (impute, scale, encode and balance!)
full_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 35320 entries, 0 to 35319
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   TOP_x    35320 non-null  int64 
 1   game_id  35320 non-null  object
 2   JGL_x    35320 non-null  int64 
 3   BOT_x    35320 non-null  int64 
 4   MID_x    35320 non-null  int64 
 5   SUP_x    35320 non-null  int64 
 6   TOP_y    35320 non-null  int64 
 7   JGL_y    35320 non-null  int64 
 8   BOT_y    35320 non-null  int64 
 9   MID_y    35320 non-null  int64 
 10  SUP_y    35320 non-null  int64 
 11  winner   35320 non-null  object
dtypes: int64(10), object(2)
memory usage: 3.5+ MB

Train a logit and test the data¶

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn import set_config; set_config(display='diagram')

from transformers import SynergyFeature, RoleFeature, ChampionWinrateFeature

# One RoleFeature transformer per role, each fed the blue (_x) and red (_y)
# champion columns of that role; ColumnTransformer concatenates their outputs.
preproc = ColumnTransformer([
    (f'{role}_rate', RoleFeature(role), [f'{role}_x', f'{role}_y'])
    for role in ('TOP', 'SUP', 'MID', 'BOT', 'JGL')
])

# Chain the preprocessing with the classifier.
pipe = make_pipeline(preproc, LogisticRegression(solver='liblinear'))
pipe
Out[15]:
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('TOP_rate',
                                                  RoleFeature(role='TOP'),
                                                  ['TOP_x', 'TOP_y']),
                                                 ('SUP_rate',
                                                  RoleFeature(role='SUP'),
                                                  ['SUP_x', 'SUP_y']),
                                                 ('MID_rate',
                                                  RoleFeature(role='MID'),
                                                  ['MID_x', 'MID_y']),
                                                 ('BOT_rate',
                                                  RoleFeature(role='BOT'),
                                                  ['BOT_x', 'BOT_y']),
                                                 ('JGL_rate',
                                                  RoleFeature(role='JGL'),
                                                  ['JGL_x', 'JGL_y'])])),
                ('logisticregression', LogisticRegression(solver='liblinear'))])
Please rerun this cell to show the HTML repr or trust the notebook.
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('TOP_rate',
                                                  RoleFeature(role='TOP'),
                                                  ['TOP_x', 'TOP_y']),
                                                 ('SUP_rate',
                                                  RoleFeature(role='SUP'),
                                                  ['SUP_x', 'SUP_y']),
                                                 ('MID_rate',
                                                  RoleFeature(role='MID'),
                                                  ['MID_x', 'MID_y']),
                                                 ('BOT_rate',
                                                  RoleFeature(role='BOT'),
                                                  ['BOT_x', 'BOT_y']),
                                                 ('JGL_rate',
                                                  RoleFeature(role='JGL'),
                                                  ['JGL_x', 'JGL_y'])])),
                ('logisticregression', LogisticRegression(solver='liblinear'))])
ColumnTransformer(transformers=[('TOP_rate', RoleFeature(role='TOP'),
                                 ['TOP_x', 'TOP_y']),
                                ('SUP_rate', RoleFeature(role='SUP'),
                                 ['SUP_x', 'SUP_y']),
                                ('MID_rate', RoleFeature(role='MID'),
                                 ['MID_x', 'MID_y']),
                                ('BOT_rate', RoleFeature(role='BOT'),
                                 ['BOT_x', 'BOT_y']),
                                ('JGL_rate', RoleFeature(role='JGL'),
                                 ['JGL_x', 'JGL_y'])])
['TOP_x', 'TOP_y']
RoleFeature(role='TOP')
['SUP_x', 'SUP_y']
RoleFeature(role='SUP')
['MID_x', 'MID_y']
RoleFeature(role='MID')
['BOT_x', 'BOT_y']
RoleFeature(role='BOT')
['JGL_x', 'JGL_y']
RoleFeature(role='JGL')
LogisticRegression(solver='liblinear')

Same role feature¶

image-3.png

In [16]:
def Role_DataFrame(df_BLUE, df_RED, csv_path='role_winrate_champ_vs_champ.csv'):
    """Compute blue-vs-red champion matchup outcome rates, per pair of roles.

    Every blue player row is paired with every red player row of the same game
    (merge on ``game_id``). For each (blue champion, blue role, red role,
    red champion) matchup, the number of pairings with a given ``win_x`` value
    is divided by the total number of pairings of that matchup.

    Parameters
    ----------
    df_BLUE, df_RED : pandas.DataFrame
        Player-level frames; both need ``champion_id``, ``role``, ``win`` and
        ``game_id`` columns (the merge suffixes them with ``_x`` / ``_y``).
    csv_path : str, path-like or None, default 'role_winrate_champ_vs_champ.csv'
        Where to cache the result as CSV so it need not be recomputed.
        Pass ``None`` to skip writing the file.

    Returns
    -------
    pandas.DataFrame
        Single-column frame of rates indexed by
        (champion_id_x, role_x, role_y, champion_id_y, win_x).
    """
    matchup_cols = ['champion_id_x', 'role_x', 'role_y', 'champion_id_y']

    # Pair the blue and red player rows of each game.
    df_BLUE_RED = pd.merge(left=df_BLUE, right=df_RED, left_on='game_id', right_on='game_id')

    # Only keep relevant columns.
    df_role = df_BLUE_RED[matchup_cols + ['win_x', 'game_id']]

    # Times a matchup occurred with each blue-side outcome (win_x True / False).
    champion_vs_champion = pd.DataFrame(df_role[matchup_cols + ['win_x']].value_counts())

    # Times a matchup occurred at all.
    total_champion_vs_champion = pd.DataFrame(df_role[matchup_cols].value_counts())

    # Share of each outcome per matchup; the division aligns the two frames on
    # the index levels they have in common (every level except win_x).
    rate_champion_vs_champion = champion_vs_champion.div(total_champion_vs_champion)

    # Cache as CSV so the table does not have to be recomputed every time.
    if csv_path is not None:
        rate_champion_vs_champion.to_csv(csv_path)
    return rate_champion_vs_champion
In [17]:
# Matrix handling is delegated to dedicated functions in utils.py
from utils import get_synergy, get_vs_rate, get_winrate
from sklearn.base import BaseEstimator, TransformerMixin

class RoleFeature(BaseEstimator, TransformerMixin):
    """Transformer mapping a (blue champion, red champion) column pair to a same-role matchup rate.

    Parameters
    ----------
    role : str
        Role passed to ``get_vs_rate`` for every row (e.g. ``'TOP'``).

    Notes
    -----
    The lookup table is read from ``role_winrate_champ_vs_champ.csv`` every time
    an instance is constructed, so that file must exist before the transformer
    (or a clone of it) is created.
    """

    def __init__(self, role):
        # Get the role winrate champion-vs-champion table; its first five CSV
        # columns form the index.
        rate_champion_vs_champion = pd.read_csv('role_winrate_champ_vs_champ.csv', index_col=[0, 1, 2, 3, 4])
        self.rate_champion_vs_champion = rate_champion_vs_champion
        self.role = role

    def fit(self, X=None, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame with one ``get_vs_rate`` value per row of ``X``.

        ``X`` is a two-column DataFrame: the first column is passed to
        ``get_vs_rate`` as its first argument and the second column as its third.
        Presumably these are the blue and red champion ids - confirm against
        ``utils.get_vs_rate``.
        """
        # Rows arrive as Series labelled by column name, so read the two values
        # by position with .iloc; plain z[0] / z[1] relies on the deprecated
        # positional fallback of Series.__getitem__.
        df = X.apply(
            lambda z: get_vs_rate(z.iloc[0], self.role, z.iloc[1], self.rate_champion_vs_champion),
            axis=1,
        )
        return pd.DataFrame(df)
In [18]:
from sklearn.preprocessing import LabelEncoder

# Encode the winning side as integer labels (LabelEncoder numbers the classes in sorted order).
y_train = LabelEncoder().fit_transform(full_data['winner'])
# Features are the champion columns: drop the game identifier and the target.
X_train = full_data.drop(columns=['game_id', 'winner'])
In [19]:
# Train pipeline
pipe.fit(X_train,y_train)
Out[19]:
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('TOP_rate',
                                                  RoleFeature(role='TOP'),
                                                  ['TOP_x', 'TOP_y']),
                                                 ('SUP_rate',
                                                  RoleFeature(role='SUP'),
                                                  ['SUP_x', 'SUP_y']),
                                                 ('MID_rate',
                                                  RoleFeature(role='MID'),
                                                  ['MID_x', 'MID_y']),
                                                 ('BOT_rate',
                                                  RoleFeature(role='BOT'),
                                                  ['BOT_x', 'BOT_y']),
                                                 ('JGL_rate',
                                                  RoleFeature(role='JGL'),
                                                  ['JGL_x', 'JGL_y'])])),
                ('logisticregression', LogisticRegression(solver='liblinear'))])
Please rerun this cell to show the HTML repr or trust the notebook.
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('TOP_rate',
                                                  RoleFeature(role='TOP'),
                                                  ['TOP_x', 'TOP_y']),
                                                 ('SUP_rate',
                                                  RoleFeature(role='SUP'),
                                                  ['SUP_x', 'SUP_y']),
                                                 ('MID_rate',
                                                  RoleFeature(role='MID'),
                                                  ['MID_x', 'MID_y']),
                                                 ('BOT_rate',
                                                  RoleFeature(role='BOT'),
                                                  ['BOT_x', 'BOT_y']),
                                                 ('JGL_rate',
                                                  RoleFeature(role='JGL'),
                                                  ['JGL_x', 'JGL_y'])])),
                ('logisticregression', LogisticRegression(solver='liblinear'))])
ColumnTransformer(transformers=[('TOP_rate', RoleFeature(role='TOP'),
                                 ['TOP_x', 'TOP_y']),
                                ('SUP_rate', RoleFeature(role='SUP'),
                                 ['SUP_x', 'SUP_y']),
                                ('MID_rate', RoleFeature(role='MID'),
                                 ['MID_x', 'MID_y']),
                                ('BOT_rate', RoleFeature(role='BOT'),
                                 ['BOT_x', 'BOT_y']),
                                ('JGL_rate', RoleFeature(role='JGL'),
                                 ['JGL_x', 'JGL_y'])])
['TOP_x', 'TOP_y']
RoleFeature(role='TOP')
['SUP_x', 'SUP_y']
RoleFeature(role='SUP')
['MID_x', 'MID_y']
RoleFeature(role='MID')
['BOT_x', 'BOT_y']
RoleFeature(role='BOT')
['JGL_x', 'JGL_y']
RoleFeature(role='JGL')
LogisticRegression(solver='liblinear')
In [20]:
from sklearn.model_selection import cross_val_score

# Cross validate pipeline (2 folds, mean accuracy)
# NOTE(review): RoleFeature looks its rates up in a pre-computed CSV. If that CSV
# was built from these same games, each validation fold has already been seen by
# the feature table and this score is optimistic - confirm how the CSV was produced.
cross_val_score(pipe, X_train, y_train, cv=2, scoring='accuracy').mean()
Out[20]:
0.5201019252548131
In [ ]: